# Scrapy settings for bilibili project
import random
from datetime import timedelta  # NOTE(review): timedelta appears unused in this file — confirm before removing

# ================== Basic configuration ==================
BOT_NAME = "bilibili"
SPIDER_MODULES = ["bilibili.spiders"]
NEWSPIDER_MODULE = "bilibili.spiders"
FEED_EXPORT_ENCODING = "utf-8"  # write exported feeds as UTF-8

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# Pool of desktop browser user-agent strings to impersonate.
USER_AGENTS = [
    # Chrome
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",

    # Firefox
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/115.0",

    # Safari
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",

    # Edge
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",

    # Opera
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 OPR/106.0.0.0",
]

# FIX: random.choice() embedded in the headers dict was evaluated exactly once,
# at import time, so every request in a run shared a single UA regardless of the
# pool above.  Hoist that one-per-run choice into the standard USER_AGENT setting
# so that requests built without DEFAULT_REQUEST_HEADERS also carry the same UA
# (instead of Scrapy's default).  True per-request rotation would require a
# custom downloader middleware.
USER_AGENT = random.choice(USER_AGENTS)

# ================== Request configuration ==================
# Headers mimicking a browser XHR call originating from www.bilibili.com.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Referer": "https://www.bilibili.com/",
    "Origin": "https://www.bilibili.com",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-site",
    "User-Agent": USER_AGENT,
}

# ================== Rate limiting ==================
CONCURRENT_REQUESTS = 1  # strictly one request in flight
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# FIX: the base delay was random.uniform(5, 8), which is rolled only once at
# import time — it never varied per request, contradicting the original
# "random 5-8s delay" comment.  Use the fixed midpoint and let
# RANDOMIZE_DOWNLOAD_DELAY apply Scrapy's built-in per-request 0.5x-1.5x
# jitter (effective delay ~3.25-9.75 s).
DOWNLOAD_DELAY = 6.5
RANDOMIZE_DOWNLOAD_DELAY = True

# AutoThrottle (conservative strategy): adapt the delay to observed latency.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5  # initial download delay in seconds
AUTOTHROTTLE_MAX_DELAY = 45  # never wait longer than 45 seconds
AUTOTHROTTLE_TARGET_CONCURRENCY = 0.2  # very low average-concurrency target
AUTOTHROTTLE_DEBUG = True  # log every throttling decision

# ================== Session & retries ==================
COOKIES_ENABLED = False  # API requests are made without cookies
COOKIES_DEBUG = False

RETRY_ENABLED = True
RETRY_TIMES = 2  # retry each failed request at most twice
# NOTE(review): 403/429 typically indicate anti-bot throttling; retrying them
# without additional backoff may prolong a block — confirm this is intended.
RETRY_HTTP_CODES = [403, 429, 500, 502, 503, 504]
RETRY_PRIORITY_ADJUST = -1  # deprioritize retried requests behind fresh ones

# ================== Middleware configuration ==================
# NOTE(review): both entries are built-in middlewares that Scrapy enables by
# default (RetryMiddleware at order 550, UserAgentMiddleware at 500); listing
# them here only overrides their ordering.  400 is also the default order of
# DefaultHeadersMiddleware — confirm these overrides are intentional.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
}

# ================== Other tweaks ==================
ROBOTSTXT_OBEY = False  # skip robots.txt checks
TELNETCONSOLE_ENABLED = False  # disable the telnet debugging console
LOG_LEVEL = 'INFO'

# Auto-shutdown guards (prevent unattended long runs)
CLOSESPIDER_TIMEOUT = 3600  # stop the spider after 1 hour
CLOSESPIDER_ERRORCOUNT = 20  # stop after 20 errors

# ================== Extension configuration ==================
# NOTE(review): CloseSpider is already enabled by default in Scrapy's
# EXTENSIONS_BASE; this entry only changes its order to 500 — confirm that
# the override is intentional.
EXTENSIONS = {
    'scrapy.extensions.closespider.CloseSpider': 500,
}